###############################################################
###############################################################
#### Stefan Müller, Samuel Brazys, and Alexander Dukalskis
#### Replication Scripts for: 
#### Discourse Wars and 'Mask Diplomacy': China's Global Image Management in Times of Crisis 
#### Political Research Exchange, 2024
#### Link to paper: https://doi.org/10.1080/2474736X.2024.2309178
###############################################################
###############################################################

## Note: check the 000_README.pdf file on Harvard Dataverse for 
## the full replication instructions and information on all code scripts.
## Link to Dataverse repository: https://doi.org/10.7910/DVN/KRXMXJ
## Please contact the authors if you have any questions or suggestions. 
## Note: due to copyright restrictions some of the files cannot be shared publicly.
## However, we provide all replication scripts and intermediate objects to reproduce
## the plots and tables included in the paper and Supporting Information.

## This file performs the validation of the LSS scores against human coding.

# load packages
library(dplyr)
library(ggplot2)
library(readr)
library(LSX)
library(quanteda)
library(texreg)
library(stringr)
library(car)
library(ggeffects)


# custom ggplot2 scheme (defined in function_theme_base.R)
source("function_theme_base.R")

# read both rounds of hand-coded data and stack them into one data frame
dat_coded_01 <- read_csv("round_01.csv")
dat_coded_02 <- read_csv("round_02.csv")
dat_coded <- bind_rows(dat_coded_01, dat_coded_02)

# number of hand-coded rows
nrow(dat_coded)

# document-feature matrix of the hand-coded texts: build a corpus, tokenise,
# and keep only features that consist solely of letters and have at least
# three characters
dfmt_sent <- dfm_select(
    dfm(tokens(corpus(dat_coded))),
    pattern = "^\\p{L}+$",
    valuetype = "regex",
    min_nchar = 3
)


# load the fitted LSS model
lss <- readRDS("data_dontshare/lss_model.rds")

# inspect the model object and its term scores
coef(lss)
str(lss)
head(lss$beta)

# one row per term with its score (beta) and frequency, sorted from the
# highest to the lowest score; only terms with a frequency above 200 are kept
dat_neg_pos <- data.frame(
    term = names(lss$beta),
    coef = lss$beta,
    freq = lss$frequency
)
dat_neg_pos <- arrange(dat_neg_pos, -coef)
dat_neg_pos <- unique(dat_neg_pos)
dat_neg_pos <- filter(dat_neg_pos, freq > 200)

# the 30 terms with the highest scores
dat_neg_pos_top <- mutate(
    top_n(dat_neg_pos, n = 30, wt = coef),
    type = "Positive"
)

head(dat_neg_pos_top)

# the 30 terms with the lowest scores, most negative first
dat_neg_pos_bottom <- top_n(dat_neg_pos, n = -30, wt = coef)
dat_neg_pos_bottom <- mutate(dat_neg_pos_bottom, type = "Negative")
dat_neg_pos_bottom <- arrange(dat_neg_pos_bottom, coef)

head(dat_neg_pos_bottom)

# load the seed words and print each list as one space-separated string
dat_seed <- read.csv("data_sentiment_seed.csv")

paste(dat_seed$positive, collapse = " ")
paste(dat_seed$negative, collapse = " ")


# Figure A02: terms with the highest and lowest LSS scores

# theme elements shared by both panels: the term labels are drawn inside the
# bars (geom_text), so the axis text and ticks for the terms are removed
theme_terms <- theme(legend.position = "none",
                     strip.text = element_text(face = "bold"),
                     axis.ticks.y = element_blank(),
                     axis.text.y = element_blank())

# panel with the most positive terms, ordered by coefficient
p_top <- ggplot(dat_neg_pos_top,
                aes(x = reorder(term, coef), y = coef)) +
    geom_col(fill = "darkgreen") +
    facet_wrap(~type) +
    scale_y_continuous(breaks = seq(0, 0.1, 0.02),
                       limits = c(0, 0.11)) +
    geom_text(aes(label = term), colour = "white",
              hjust = 1, nudge_y = -0.005, size = 5) +
    coord_flip() +
    labs(x = NULL, y = "Coefficient") +
    theme_terms
p_top

# panel with the most negative terms, ordered by (negated) coefficient
p_bottom <- ggplot(dat_neg_pos_bottom,
                   aes(x = reorder(term, -coef), y = coef)) +
    geom_col(fill = "darkred") +
    facet_wrap(~type) +
    scale_y_continuous(breaks = seq(-0.1, 0, 0.02)) +
    geom_text(aes(label = term), colour = "white",
              hjust = 0, nudge_y = 0.005, size = 5) +
    coord_flip() +
    labs(x = NULL, y = "Coefficient") +
    theme_terms
p_bottom

# combine both panels; the combined figure is kept in an object and passed
# to ggsave() explicitly, because ggsave() otherwise saves last_plot(), which
# is only updated when a plot is printed (e.g. not when the script is run
# through source() without echo)
fig_a02 <- cowplot::plot_grid(p_bottom, p_top)
fig_a02

ggsave("fig_a02.pdf",
       plot = fig_a02,
       width = 9, height = 9)
ggsave("fig_a02.png",
       plot = fig_a02,
       dpi = 300,
       width = 9, height = 9)


# peek at the term frequencies stored in the model
head(lss$frequency)

# score the hand-coded texts with the LSS model; the prediction is converted
# to a data frame, of which only the fitted scores are used below
pred <- predict(lss, newdata = dfmt_sent, se_fit = TRUE)
pred <- as.data.frame(pred)

# attach the fitted scores to the hand-coded data (relies on the row order of
# dfmt_sent matching dat_coded)
dat_coded[["fit"]] <- pred[["fit"]]

# recode the hand-coding into more meaningful labels
# (the recode specifications are multi-line strings and are kept verbatim)
dat_joined <- mutate(
    dat_coded,
    # five-point scale from "Very Negative" to "Very Positive"
    neg_pos_detailed = car::recode(
        portrayal_china,
                                          "'-2'='Very Negative';
                                          '-1'='Negative';
                                          '0'='Neutral';
                                          '1'='Positive';
                                          '2'='Very Positive'"),
    # code 99 and missing values are labelled "Irrelevant"
    neg_pos_detailed = car::recode(
        neg_pos_detailed,
                                          "99='Irrelevant';
                                          NA='Irrelevant'"),
    # collapse "Very Negative"/"Negative" and "Very Positive"/"Positive";
    # every other label is carried over unchanged
    neg_pos_merged = case_when(
        str_detect(neg_pos_detailed, "Negative") ~ "Negative",
        str_detect(neg_pos_detailed, "Positive") ~ "Positive",
        TRUE ~ neg_pos_detailed
    )
)

# drop the texts labelled "Irrelevant"
dat_joined <- filter(dat_joined, neg_pos_detailed != "Irrelevant")


# store data to reproduce Figure 03
save(dat_joined, file = "data_fig_03.Rdata")

# reload the stored object (presumably so that the remainder of the script
# can also be run starting from the shared intermediate file)
load(file = "data_fig_03.Rdata")

# create an aggregated category that changes irrelevant to NA.
# The variable is built from the aggregated coding (neg_pos_merged); it was
# previously derived from neg_pos_detailed, which made it a copy of the
# five-category coding despite its name. NA_character_ keeps the column type
# explicit.
dat_joined <- dat_joined |>
    mutate(neg_pos_merged_with_na = if_else(neg_pos_merged == "Irrelevant",
                                            NA_character_,
                                            neg_pos_merged))



# order the detailed categories from most negative to most positive
levels_detailed <- c("Very Negative", "Negative", "Neutral",
                     "Positive", "Very Positive")

dat_joined <- mutate(
    dat_joined,
    neg_pos_detailed = factor(neg_pos_detailed, levels = levels_detailed)
)

# frequency table of the coding that has "Irrelevant" set to NA
table(dat_joined$neg_pos_merged_with_na)

# regress the LSS score on the human coding, once per coding variant

# aggregated coding
lm_validate_merged <- lm(
    fit ~ neg_pos_merged,
    data = dat_joined
)

# coding with "Irrelevant" set to NA
lm_validate_merged_with_na <- lm(
    fit ~ neg_pos_merged_with_na,
    data = dat_joined
)

# detailed (five-category) coding
lm_validate_detailed <- lm(
    fit ~ neg_pos_detailed,
    data = filter(dat_joined, neg_pos_detailed != "Irrelevant")
)


# predicted LSS score for each category of the aggregated human coding
eff_neg_pos_merged <- as.data.frame(
    ggpredict(lm_validate_merged, terms = "neg_pos_merged")
)

# fix the display order of the categories
eff_neg_pos_merged$x <- factor(
    eff_neg_pos_merged$x,
    levels = c("Negative", "Neutral", "Positive")
)

# panel (a): point estimates with ranges of +/- 1.96 (thin line) and
# +/- 1.645 (thick line) standard errors around the prediction
p_1 <- ggplot(eff_neg_pos_merged, aes(x = x, y = predicted))

p_1 <- p_1 +
    geom_hline(aes(yintercept = 0), linetype = "dashed", colour = "grey70") +
    geom_point(size = 3) +
    geom_linerange(
        aes(ymin = predicted - 1.96 * std.error,
            ymax = predicted + 1.96 * std.error),
        size = 0.7
    ) +
    geom_linerange(
        aes(ymin = predicted - 1.645 * std.error,
            ymax = predicted + 1.645 * std.error),
        size = 1.3
    )

# horizontal layout, fixed score axis, and labels
p_1 <- p_1 +
    coord_flip() +
    scale_y_continuous(limits = c(-1.2, 2.2),
                       breaks = seq(-1, 2, 0.5)) +
    labs(x = "Human Coding",
         y = "Predicted LSS Score",
         title = "(a) Aggregated Classification")

p_1


# predicted LSS score for each category of the detailed human coding
eff_neg_pos_merged_detaild <- ggpredict(lm_validate_detailed,
                                        terms = "neg_pos_detailed")

# fix the display order of the categories
eff_neg_pos_merged_detaild$x <- factor(
    eff_neg_pos_merged_detaild$x,
    levels = c("Very Negative", "Negative", "Neutral",
               "Positive", "Very Positive")
)

# panel (b): point estimates with ranges of +/- 1.96 (thin line) and
# +/- 1.645 (thick line) standard errors around the prediction
p_2 <- ggplot(eff_neg_pos_merged_detaild, aes(x = x, y = predicted))

p_2 <- p_2 +
    geom_hline(aes(yintercept = 0), linetype = "dashed", colour = "grey70") +
    geom_point(size = 2.5) +
    geom_linerange(
        aes(ymin = predicted - 1.96 * std.error,
            ymax = predicted + 1.96 * std.error),
        size = 0.6
    ) +
    geom_linerange(
        aes(ymin = predicted - 1.645 * std.error,
            ymax = predicted + 1.645 * std.error),
        size = 1.2
    )

# horizontal layout, fixed score axis, and labels
p_2 <- p_2 +
    coord_flip() +
    scale_y_continuous(limits = c(-1.2, 2.2),
                       breaks = seq(-1, 2, 0.5)) +
    labs(x = "Human Coding",
         y = "Predicted LSS Score",
         title = "(b) Detailed Classification")

p_2

# Figure 03: both validation panels side by side.
# The combined figure is kept in an object and passed to ggsave() explicitly,
# because ggsave() otherwise saves last_plot(), which is only updated when a
# plot is printed (e.g. not when the script is run through source() without
# echo).
fig_03 <- cowplot::plot_grid(p_1, p_2, nrow = 1, align = "hv",
                             axis = "r", rel_widths = c(0.48, 0.52))
fig_03

ggsave("fig_03.png",
       plot = fig_03,
       dpi = 300,
       width = 9, height = 3)
ggsave("fig_03.pdf",
       plot = fig_03,
       width = 9, height = 3)
ggsave("fig_03.eps",
       plot = fig_03,
       device = "eps",
       width = 9, height = 3)
